Assignment - 2

Team Name : Team APP

Members :
Pradyumna YM PES1201700986
Anush V Kini PES1201701646
Punit Pranesh Koujalgi PES1201701502

#packages used in this assignment
#install.packages("plotly")
#install.packages("tm")
#install.packages("wordcloud")
#install.packages("syuzhet")

#Loading Packages
library(syuzhet)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Directory that holds the input files; the reads further down use file
# names relative to it.
path <- "D:/dataset"
setwd(path)

Question-1

Library plotly helps make brilliant visualizations that are highly interactive and appealing.

  1. Use this library to visualize all the parameters in fitness_data.csv(except activityID,subID and timestamp(s)) against timestamp(x-axis) for each subID.
  2. To represent in a single graph, use dropdown menus. First dropdown menu will be used to select the subID. Second dropdown menu will be used to select the column to be analyzed.
    (For eg: User wants to visualize heart rate activity column(dropdown 1) for subID 104(dropdown 2))
  3. Use a slider to control the range of timestamp.

# Load the fitness recordings and keep only complete rows: na.omit() drops
# every row that has an NA in any column.
fitness_data <- na.omit(read.csv("fitness_data.csv"))

# Interactive line chart of every sensor column against the timestamp.
#   * one trace per sensor column; only the first trace starts visible
#   * a filter transform on subID restricts all traces to a single subject
#   * the first dropdown switches the visible column (and the y-axis title),
#     the second dropdown switches the subject
#   * a range slider selects the timestamp interval

# Display label -> column-name prefix, in dropdown order.
# Columns are looked up by prefix because the magnetometer headers carry a
# non-ASCII character after "in." whose sanitised form depends on the
# encoding the CSV was read with; a literal name may silently fail to match.
sensor_prefixes <- c(
  "Heart Rate"     = "heartrate.during.activity..bpm.",
  "Body Temp."     = "Body.Temperature..Celsius.",
  "Normal HR"      = "normal.HR..bpm.",
  "Acceleration X" = "Acceleration..x.axis..in.m.s2",
  "Gyroscope X"    = "Gyroscope..x.axis..in.rad.s",
  "Magnetometer X" = "Magnetometer..x.axis..in.",
  "Acceleration Y" = "Acceleration..y.axis..in.m.s2",
  "Gyroscope Y"    = "Gyroscope..y.axis..in.rad.s",
  "Magnetometer Y" = "Magnetometer..y.axis..in.",
  "Acceleration Z" = "Acceleration..z.axis..in.m.s2",
  "Gyroscope Z"    = "Gyroscope..z.axis..in.rad.s",
  "Magnetometer Z" = "Magnetometer..z.axis..in."
)

# Return the single column name matching `prefix` (an exact match wins);
# stop with a clear message instead of silently plotting an empty trace.
resolve_column <- function(prefix, available) {
  if (prefix %in% available) {
    return(prefix)
  }
  hits <- available[startsWith(available, prefix)]
  if (length(hits) != 1L) {
    stop(
      "Expected exactly one column starting with '", prefix,
      "', found ", length(hits),
      call. = FALSE
    )
  }
  hits
}

sensor_columns <- vapply(
  sensor_prefixes,
  resolve_column,
  character(1),
  available = names(fitness_data)
)
subject_ids <- unique(fitness_data$subID)

# Base plot: shared x axis plus the subject filter inherited by all traces.
p <- plot_ly(
  fitness_data,
  x = ~timestamp..in.seconds.,
  transforms = list(
    list(
      type = "filter",
      target = ~subID,
      operation = "=",
      value = subject_ids[1]
    )
  )
)

# One line trace per sensor column. The trace type is given explicitly so
# plotly does not have to guess it for every trace.
for (i in seq_along(sensor_columns)) {
  p <- add_trace(
    p,
    y = fitness_data[[sensor_columns[[i]]]],
    type = "scatter",
    mode = "lines",
    name = names(sensor_columns)[i],
    visible = i == 1L
  )
}

# Column dropdown: button i shows trace i only and retitles the y axis.
column_buttons <- lapply(seq_along(sensor_columns), function(i) {
  label <- names(sensor_columns)[i]
  list(
    method = "update",
    args = list(
      list(visible = as.list(seq_along(sensor_columns) == i)),
      list(yaxis = list(title = label))
    ),
    label = label
  )
})

# Subject dropdown: each button rewrites the value of the filter transform.
subject_buttons <- lapply(subject_ids, function(id) {
  list(
    method = "restyle",
    args = list("transforms[0].value", id),
    label = as.character(id)
  )
})

p <- p %>%
  rangeslider(start = 1500, end = 1900) %>%
  layout(
    title = "Fitness_data",
    xaxis = list(title = "Timestamp"),
    yaxis = list(title = names(sensor_columns)[1]),
    showlegend = FALSE,
    updatemenus = list(
      list(
        y = 1.1,
        buttons = column_buttons
      ),
      list(
        y = 0.95,
        x = -0.1,
        type = "dropdown",
        active = 0,
        buttons = subject_buttons
      )
    )
  )

# show the plot
p
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter

Question-1

1 d. What insights could you glean from this plot?

As we can see from the normal heart rates, subjects 104 and 106 have a higher normal heart rate than the other subjects. This indicates that these individuals are relatively younger than the other subjects, as younger people tend to have a higher normal heart rate (reference: heart.org).
We notice that the heart rate during physical activities can increase to almost double the normal heart rate. From what we see, the first activity for each subject is not a physically intensive activity as we see that the heart rates are pretty normal compared to the other activities.
The above point is reinforced by the nature of the plots of the IMU sensor readings. We see that the acceleration plot varies relatively more in the later parts of the plot than the initial part of the plot.
If we look at the body temperature plots for all the subjects, we see that initially, the body temperature is gradually increasing, but still in the normal temperature range. But, after about 1600th timestamp, we see that the temperature abruptly drops first and then gradually keeps dropping till about 2500th time stamp. This gives us the impression that the activity performed during that interval is performed in cool environments, such as Swimming or Ice Skating.
The subject with ID 103 seems to be the most physically fit, since he/she has taken part in some activity throughout the entire time period, something which can't be observed in the plots of the other subjects.

Question-2

What is undersampling and oversampling? Consider the dataset subject.csv. Is there a case of undersampling or oversampling? If so, mention a technique to remedy the problem. Justify your answer.

# Bar chart of the number of rows in subject.csv per value of the Sex column.
subject_data <- read.csv("subject.csv")

# Tabulate explicitly instead of relying on plot(): plot() only draws a bar
# chart when Sex is a factor, and since R 4.0 read.csv() returns character
# columns by default, for which plot() fails.
barplot(table(subject_data$Sex),
        main = "Barplot of number of Data Points by gender",
        xlab = "Gender",
        ylab = "Number of Data Points")

Oversampling and undersampling are two techniques used to adjust the class distribution of a dataset, that is, to maintain a specific ratio between the number of data points for each class in the sample, which helps a model learn the classification problem better.

Oversampling is used when the data collected is not sufficient, i.e. when one of the classes is not well represented. In such cases, oversampling methods can be used to duplicate data points of that class in order to obtain a more balanced number of data points for each class, which helps when training a model.

On the other hand, undersampling is used when the data collected is sufficient. When one of the classes is over-represented, undersampling is applied to the data points belonging to this class in order to balance it with the minority classes.

An oversampling technique that can be used is SMOTE. SMOTE stands for Synthetic Minority Oversampling Technique. The module works by generating new instances from existing minority cases. The algorithm takes samples of the feature space for each target class and its nearest neighbors, and generates new examples that combine features of the target case with features of its neighbors. This approach increases the features available to each class and makes the samples more general. But in this case, it will not work as there is only a single female sample.

Question-3

There are various techniques for sampling data. Suggest a sampling technique that you think is ideal for the data in fitness_data.csv, and justify your choice.

The best approach would be to first divide the data into strata based on the activity. Within each stratum, systematic sampling can be used: the data is sorted by timestamp and every k-th data point is kept. This reduces the sample size while still retaining the relative order of the data points, which is important for attributes such as the heart rate.

Question-4

In August 2018, the Election Commission of India made the Lok Sabha 2014 data (Lok Sabha-2014 data.csv) public so that analysts can use it for the 2019 Lok Sabha election. Provide a suitable visualisation that accounts for the distribution of votes across the country.

# Map of Lok Sabha 2014 results: one square marker per row of the data set,
# coloured by party, with every party outside the top five grouped as
# "Other".

# read the dataset
ds <- read.csv("Lok Sabha-2014 data.csv")

# the five parties that occur most often in the PARTY column
top5party <- names(tail(sort(table(ds$PARTY)), 5))

# assign each party a color
colorParty <- list(
  "Bharatiya Janata Party" = "orange",
  "All India Trinamool Congress" = "violet",
  "All India Anna Dravida Munnetra Kazhagam" = "red",
  "Other" = "grey",
  "Indian National Congress" = "green",
  "Biju Janata Dal" = "mediumturquoise"
)

# The colour table is hard-coded, so fail with a readable message if the
# data's top five parties are not all covered by it.
unmapped <- setdiff(top5party, names(colorParty))
if (length(unmapped) > 0) {
  stop(
    "No colour defined in colorParty for: ",
    paste(unmapped, collapse = ", "),
    call. = FALSE
  )
}

# convert factor to character type so party names can be overwritten
ds$PARTY <- as.character(ds$PARTY)

# relabel every minor party as "Other" (vectorised; also covers NA)
ds$PARTY[!(ds$PARTY %in% top5party)] <- "Other"

# per-row colour column, looked up by (relabelled) party name
party_palette <- unlist(colorParty)
ds$partyColors <- unname(party_palette[ds$PARTY])

# set the layout parameters such as initial position.
g <- list(
  scope = "asia",
  # "asia" is a scope, not a projection type, so the original value
  # projection = list(type = 'asia') was invalid.
  # NOTE(review): "mercator" is believed to be the projection plotly.js
  # falls back to for the asia scope -- confirm the map renders unchanged.
  projection = list(type = "mercator"),
  showland = TRUE,
  landcolor = toRGB("gray95"),
  subunitcolor = toRGB("gray85"),
  countrycolor = toRGB("gray85"),
  countrywidth = 1,
  subunitwidth = 1,
  # longitude / latitude window of the initial view
  lonaxis = list(range = c(68, 97)),
  lataxis = list(range = c(8, 37))
)

# plot the points on the map and add markers with the party color.
# Pass the layout parameters.
plot_geo(ds, lat = ~latitude, lon = ~longitude) %>%
  add_markers(
    text = ~paste(
      " Constituency:", CONSTITUENCY, "<br />",
      "Winner:", WINNER.NAME, "<br />",
      "Win Margin:", MARGIN, "Votes", "<br />",
      "Party:", PARTY
    ),
    color = ~PARTY,
    colors = party_palette,
    symbol = I("square"),
    size = I(8),
    hoverinfo = "text",
    showlegend = TRUE
  ) %>%
  layout(
    # the trailing "Legend" in the title sits above the legend box
    title = 'India Lok Sabha Elections Constituency Map                 Legend',
    geo = g
  )

Question-5

Many good Bollywood movies were released in 2019, one of them being Kabir Singh. The file tweets.txt contains what people have tweeted about this movie. Provide a suitable visualization that depicts the general sentiment of the audience.

# Build a word cloud from the tweets: read them, clean the text with tm
# transformations, then plot the most frequent remaining words.
# The `## Warning ...` lines below appear to be console output captured when
# the document was rendered; they are not part of the code.

# read the file line by line (one element of `text` per line of the file)
text <- readLines("tweets.txt",encoding = "UTF-8")

# create a corpus with one document per line
corpus <- Corpus(VectorSource(text))

# toSpace: content transformer that replaces every match of `pattern`
# with a single space.


toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))

# collapse runs of whitespace into a single blank
# NOTE(review): the later removals leave new gaps behind, so this step would
# be more effective at the end of the pipeline.
corpus<-tm_map(corpus,stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents
# convert all words to lowercase
# NOTE(review): passing tolower directly works for the SimpleCorpus named in
# the warnings; content_transformer(tolower) is the portable form.
corpus<-tm_map(corpus,tolower)
## Warning in tm_map.SimpleCorpus(corpus, tolower): transformation drops
## documents
# remove all numbers and punctuation, then English stopwords
# NOTE(review): punctuation is stripped before stopword removal, so
# contractions such as "don't" have already become "dont" -- confirm intended.
corpus<-tm_map(corpus,removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
corpus<-tm_map(corpus,removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
corpus<-tm_map(corpus,removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
# replace two stray accented characters with spaces
# (presumably leftovers of mis-decoded text -- TODO confirm)
corpus<-tm_map(corpus,toSpace,"ã")
## Warning in tm_map.SimpleCorpus(corpus, toSpace, "ã"): transformation drops
## documents
corpus<-tm_map(corpus,toSpace,"â")
## Warning in tm_map.SimpleCorpus(corpus, toSpace, "â"): transformation drops
## documents
# remove hand-picked words: movie/cast names, hashtags and filler words that
# would otherwise dominate the cloud
# NOTE(review): entries containing "ã"/"â" and the upper-case "S" look
# unreachable after the lowercasing and toSpace steps above -- verify.
corpus <- tm_map(corpus , removeWords , c("and", "the", "our", "that", "for","are","also","more","has","have","should","this","with","ã\u0082â"," ã¢â\u0080â¦","kabirsingh","shahidkapoor","kabir","singh"," ã¢â\u0080â¦","advanikiara","will","kabirsinghreview","arjunreddy","arjun","now","reddy","kabirsinghmovie","can","one","day","sir","box","kapoor","imvangasandeep","kiaraadvani","movie","shahid","kabirsinghmoviereview","itsbhushankumar","things","everyone","said","advani","muradkhetani","screen","S","t","s","just","film","hai","kiara","kabirsinghboxoffice","watching","office","watched","tseries"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, c("and", "the",
## "our", : transformation drops documents
# (disabled) word-frequency inspection used while choosing the words above
#tdm <-TermDocumentMatrix (corpus) #Creates a TDM

#TDM1<-as.matrix(tdm) #Convert this into a matrix format

#v = sort(rowSums(TDM1), decreasing = TRUE) #Gives you the frequencies for every word
#head(v,150)

# draw a word cloud of at most 150 words, plotted in decreasing order of
# frequency (random.order=FALSE), with 35% of the words rotated
wordcloud(corpus,scale=c(2.5,0.5), max.words=150, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))

# Score every raw tweet line with the NRC sentiment lexicon.
# Heads-up: this call needs roughly 3-4 minutes to finish.
sent <- get_nrc_sentiment(text)

# Plot one bar per sentiment column; bar height is the column total.
#
# Args:
#   sentiments: data frame or matrix of numeric scores, one column per
#     sentiment and one row per text. Defaults to the global `sent`, so the
#     existing zero-argument call keeps working.
# Returns:
#   (invisibly) the named vector of column totals that was plotted.
#
# The totals are computed up front rather than handing the full matrix to
# barplot(): a matrix is drawn as stacked bars with one bordered segment per
# row, which gives the same bar heights but is slow and visually cluttered.
# NOTE(review): the bars show summed scores, not a head count, so the
# "Number of People" axis label may be misleading -- confirm the wording.
plt_sentiments <- function(sentiments = sent) {
  totals <- colSums(as.matrix(sentiments))
  barplot(totals,
          las = 2,
          col = "blue",
          main = "A barplot of various Sentiments",
          cex.axis = 0.6,
          xlab = "",
          ylab = "")
  # axis titles are added separately so they can be pushed clear of the
  # rotated (las = 2) tick labels
  title(ylab = "Number of People", line = 3.3)
  title(xlab = "Sentiment", line = 4.25)
  invisible(totals)
}

# render the sentiment bar chart
plt_sentiments()